# setup and load packages
knitr::opts_chunk$set(
    echo = TRUE, message = FALSE, warning = FALSE, fig.path = "Convergence_plots/",  dev = "svg"
)
library(ggplot2) # for plotting
suppressPackageStartupMessages( 
  library(cowplot) # pretty ggplot
)
library(reshape2) # rearranging/'melt'-ing tables
library(plyr) # summarising (pivot-table-style) tables 

Convergent sequences

Heavy chains - Same V gene, Same J gene, Same CDR3 length, 85% CDR3 identity

CDR3s with length > 30 AAs & unproductive sequences are ignored here for simplicity. Considered here the following:

convg_data <- test_data[which(test_data$Num_AAs <= 30 & 
                               test_data$V.DOMAIN.Functionality == "productive"), ]
convg_data <- convg_data[which(convg_data$SampleType2 %in% c("Healthy",
                                                             "COVID-19",
                                                             "COVID-19Recovered")), ]

# first take list of IgH that binds Spike - definite ones from PDB:
pdb_ab <- read.table('known_binders/20210519ver_sabdab_SARSCoV2Abs_sequences_VJgenes.tsv', 
                     sep = "\t", header = TRUE, stringsAsFactors = FALSE)
# take only CDR3, ie removing the canonical C and W that flanks CDR3
pdb_ab$CDRH3 <- substr(pdb_ab$CDRH3, 2, nchar(pdb_ab$CDRH3)-1) 
pdb_ab$struct_id <- apply(pdb_ab[, 1:2], MARGIN = 1, paste, collapse = "|")
# pdb_ab$Specificity <- "PDB"
pdb_ab <- ddply(pdb_ab, c("CDRH3", "Specificity", "Vgene", "Jfamily"), summarise,
                  fasta_header = paste(struct_id, collapse = ","),
                  molecule_name = paste(unique(molecule_name), collapse = ","))
pdb_ab$source <- "PDB"
pdb_ab$molecule_name <- apply(pdb_ab[, c("fasta_header", "molecule_name")], 
                                MARGIN = 1, function(x) paste(x[1], x[2], sep = ": "))
pdb_ab <- pdb_ab[, -which(colnames(pdb_ab) == "fasta_header")]

# also read in known binders determined from experiments, obtained from publications
cloned_ab_pub <- c("known_binders/Dugan_et_al.txt",
                  "known_binders/Brower_et_al.txt",
                  "known_binders/Kreer_et_al.txt",
                  "known_binders/Robbiani_et_al.txt",
                  "known_binders/Gaebler_et_al.txt",
                  "known_binders/Zost_et_al.txt")
cloned_ab_pub <- lapply(cloned_ab_pub, read.table, stringsAsFactors=FALSE, 
                       sep = "\t", header = TRUE, comment.char = "#")
cloned_ab_pub[[1]]$CDRH3 <- substr(cloned_ab_pub[[1]]$CDRH3, 2,
                                  nchar(cloned_ab_pub[[1]]$CDRH3)-1) 
cloned_ab_pub[[2]]$CDRH3 <- substr(cloned_ab_pub[[2]]$CDRH3, 2,
                                  nchar(cloned_ab_pub[[2]]$CDRH3)-1) 
cloned_ab_pub[[3]]$CDRH3 <- substr(cloned_ab_pub[[3]]$CDRH3, 2,
                                  nchar(cloned_ab_pub[[3]]$CDRH3)-1) 
cloned_ab_pub[[4]]$Specificity <- "Spike/RBD"
cloned_ab_pub[[5]]$Specificity <- "Spike/RBD"
cloned_ab_pub[[1]]$source <- "Dugan_et_al"
cloned_ab_pub[[2]]$source <- "Brouwer_et_al"
cloned_ab_pub[[3]]$source <- "Kreer_et_al"
cloned_ab_pub[[4]]$source <- "Robbiani_et_al"
cloned_ab_pub[[5]]$source <- "Gaebler_et_al"
cloned_ab_pub[[6]]$source <- "Zost_et_al"
cloned_ab_pub <- do.call("rbind", cloned_ab_pub)
cloned_ab_pub <- ddply(cloned_ab_pub, c("CDRH3", "Specificity", "Vgene", "Jfamily"),
                       summarise,
                       molecule_name = paste(sort(unique(Antibody)), collapse = ","),
                       source = paste(sort(unique(source)), collapse = ","))
cloned_ab <- rbind(pdb_ab, cloned_ab_pub)
cloned_ab <- ddply(cloned_ab, c("CDRH3", "Specificity", "Vgene", "Jfamily"), summarise,
                   molecule_name = paste(sort(unique(molecule_name)), collapse = ","),
                   source = paste(sort(unique(source)), collapse = ","))
cloned_ab$Num_AAs <- sapply(cloned_ab$CDRH3, nchar) 

# group sequences by same V and J genes
convg_data <- convg_data[, c("AA_CDR3_edited", "SampleType2", "Vgene", "Jfamily",
                             "Seq_ID", "SampleType2", "Num_AAs")]
colnames(convg_data) <- colnames(cloned_ab)
convg_data <- rbind(convg_data, cloned_ab)
convg_data <- split(convg_data, f = list(convg_data$Vgene, convg_data$Jfamily,
                                         convg_data$Num_AAs))
convg_data <- convg_data[sapply(convg_data, nrow) > 1]
# for each V-J combination, calculate % identity, take >= 85%
converge <- lapply(names(convg_data), function(x){
  tb <- convg_data[[x]]
  print(x)
  sims <- stringdist::stringsimmatrix(tb$CDRH3)
  sims[lower.tri(sims)] <- NA
  sims <- reshape2::melt(sims)
  sims <- sims[which(!is.na(sims[, 3])), ]
  sims <- sims[which(sims[, 1] != sims[, 2]), ]
  sims <- sims[which(sims[, 3] >= 0.85), ]
  sims[, 1] <- factor(sims[, 1], levels = 1:nrow(tb), labels = tb$molecule_name)
  sims[, 2] <- factor(sims[, 2], levels = 1:nrow(tb), labels = tb$molecule_name)
  sims
})
converge <- converge[sapply(converge, function(x) nrow(x) > 0)]
converge <- do.call("rbind", converge)

# map back metadata about the sequences
convg_data <- do.call("rbind", convg_data)
converge <- merge(converge, convg_data[, c("molecule_name", "CDRH3", "Specificity",
                                           "Vgene","Jfamily")],
                  by.x = "Var1", by.y = "molecule_name", all.x = TRUE, all.y = FALSE,
                  sort = FALSE)
colnames(converge)[4:7] <- paste0("Seq1_", colnames(converge)[4:7]) 
converge <- merge(converge, convg_data[, c("molecule_name", "CDRH3", "Specificity",
                                           "Vgene","Jfamily")],
                  by.x = "Var2", by.y = "molecule_name", all.x = TRUE, all.y = FALSE,
                  sort = FALSE)
colnames(converge)[8:11] <- paste0("Seq2_", colnames(converge)[8:11]) 
colnames(converge)[1:3] <- c("Seq2", "Seq1", "identity")

# retain only the sequences coming from different patients
converge$Seq1_PatientID <- sapply(as.character(converge$Seq1), 
                                  function(x) gsub("D[0-9]*$", "", 
                                                   unlist(strsplit(x, split = "_"))[1],
                                                   perl = TRUE))
converge$Seq2_PatientID <- sapply(as.character(converge$Seq2), 
                                  function(x) gsub("D[0-9]*$", "", 
                                                   unlist(strsplit(x, split = "_"))[1],
                                                   perl = TRUE))
converge <- converge[which(converge$Seq1_PatientID != converge$Seq2_PatientID), ]
saveRDS(converge, "ConvergeSequences_SpikeAbPDB_graph_SameVJPatient.rds")

Known binders

Added to this set of sequences known binders of SARS-CoV-2 proteins from the following sources:

  • CDR3s from antibody structures resolved in complex with SARS-CoV-2 proteins (Search PDBe 21st May 2021). All structures are binding the Spike protein (but could be targeting different domains/epitopes). *. Cloned & Experimentally confirmed binders to SARS-CoV-2 proteins from these publications:
  1. (Brouwer et al. 2020, Table S1) - S protein specific. Only took those with detected binding to RBD / Spike (AUC > 1, ref (Brouwer et al. 2020) Figure 4) (link to article)
  2. (Robbiani et al. 2020, Supplementary Table S5, S6) (PI Michel Nussenzweig) - RBD protein specific (link to article)
  3. (Gaebler et al. 2021, Supplementary Table S5, S6) (follow-up of Robbiani et al. (2020)) - RBD protein specific (link to article)
  4. (Kreer et al. 2020, GenBank entries) (PI Florian Klein) - SARS-CoV-2 neutralizing antibodies (link to article)
  5. (Dugan et al. 2021, Table S7 & Supplementary Data) (PI Patrick Wilson) - S, NP, ORF8 specific B cells; 10X VDJ with paired Abs (link to article)
  6. (Zost et al. 2020, Supplementary Table 4) - workflow to isolate mAbs specific against Spike, Spike/RBD and Spike/NTD, and those cross-reactive (ie reactive to both SARS-CoV and SARS-CoV-2 S protein) (link to article)

If V & J germline gene assignment are missing from data source they are annotated using either IMGT/High-VQuest (if nucleotide sequences are available) or IMGT/DomainGapAlign (if only amino acid sequences are available). Total: 822 unique CDRH3 sequences from PDB and publications.

Evaluate CDRH3 AA identity, and construct a network connecting sequences (from data / known binders). Edges are drawn between pairs of sequences using identical rules as in connecting the convergent repertoire sequences.

converge <- readRDS("ConvergeSequences_SpikeAbPDB_graph_SameVJPatient.rds")
convg_data <- do.call("rbind", convg_data)

# process in graph object
converge_graph <- igraph::graph_from_data_frame(converge, directed = FALSE)

# the graph is very disconnected
components <- igraph::components(converge_graph)

# size distribution of components
ggplot(data.frame(components$csize)) + geom_histogram(aes(components.csize)) +
  scale_x_log10(name = "Cluster size") + cowplot::theme_cowplot() + 
  ggtitle("Size distribution of clusters") + ylab("Number of clusters")

# components with at least 10 sequences - makeup breakdown by COVID/Healthy/YFV/PDB
membership <- data.frame(components$membership)
membership$Seq_ID <- rownames(membership)
colnames(membership)[1] <- "cluster"
membership <- list(
  membership, 
  data.frame(cluster = 1:length(components$csize), cluster_size = components$csize)
)
membership <- merge(membership[[1]], membership[[2]], by = "cluster",
                    all.x = TRUE, all.y = FALSE, sort = FALSE)
converge_nodes <- list(
  converge[, which(grepl("^Seq1", colnames(converge)))],
  converge[, which(grepl("^Seq2", colnames(converge)))]
)
converge_nodes <- unique(do.call("rbind", lapply(converge_nodes, function(tb){
  colnames(tb) <- c("Seq_ID", "CDRH3", "Specificity", "Vgene", "Jfamily", "PatientID")
  tb
})))
membership <- merge(membership, converge_nodes, 
                    all.x = TRUE, all.y = FALSE, sort= FALSE)
membership$SampleType <- factor(membership$Specificity,
                                levels = c("COVID-19", "Healthy", "COVID-19Recovered",
                                           "PDB", "Spike", "Spike/RBD"),
                                labels = c("Repertoire", "Repertoire", "Repertoire",
                                           "Binders", "Binders", "Binders"))
# those clusters with both repertoire sequences and known binders
membership_both <- ddply(membership, "cluster", summarise,
                         both = ("Repertoire" %in% SampleType & "Binders" %in% SampleType))
membership_both <- membership_both[which(membership_both$both), "cluster"]
membership$both <- (membership$cluster %in% membership_both)

membership <- merge(membership, test_data[, c("Seq_ID", "NumInClone", "CloneGroup", "Age")],
                    by = "Seq_ID", all.x = TRUE, all.y = FALSE, sort = FALSE)
membership$SampleType2 <- membership$Specificity
saveRDS(membership, 'convergent_cluster_membership.rds')
# largest clusters & V/J gene use
cluster_summary <- membership[, c("cluster", "cluster_size", "SampleType2", "Age", "PatientID", "Seq_ID",
                                  "Vgene", "Jfamily", "both")]
cluster_summary$AgeGroup <- sapply(cluster_summary$Age, function(x){
  if(x %in% c("Young", "Old")){
    if(x == "Young") return("leq50")
    if(x == "Old") return("geq60")
  } else if(is.na(x)) return(NA) else {
    x <- as.numeric(x)
    if(x <= 50) return("leq50")
    if(x >= 60) return("geq60")
    return(NA)
  }
  return(NA)
})
cluster_summary <- cluster_summary[order(cluster_summary$cluster_size, decreasing = TRUE), ]
cluster_large <- cluster_summary[which(cluster_summary$cluster_size >= 10), ]
cluster_large$SampleType2 <- replace(cluster_large$SampleType2, which(cluster_large$SampleType2 == "PDB"),
                                     "Spike/RBD")
cluster_large <- ddply(cluster_large, c("cluster", "cluster_size", "Vgene", "Jfamily", "AgeGroup", "SampleType2"),
                       summarise, V1 = length(Seq_ID), n_patient = length(unique(PatientID)))
cluster_large$cluster <- factor(cluster_large$cluster, 
                                levels = unique(cluster_large$cluster[order(cluster_large$cluster_size, 
                                                                            decreasing = TRUE)]))

# n sequences per cluster
g1 <- ggplot(cluster_large, aes(x = cluster, y = V1, fill = SampleType2)) + geom_bar(stat = "identity") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) + 
  scale_fill_manual(values = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4", "Spike" = "#99CCFF", 
                               "COVID-19Recovered" = "#FDC086", "Spike/RBD" = "#0000FF"), name = "",
                    labels = c("CV19", "CV19-Recovered", "Healthy", "Spike", "Spike/RBD")) +
  xlab("Convergent clusters") +ylab("Number of\nsequences") + ggtitle("Clusters with 10 or more sequences (n = 43)")
# Vgene usage
g2 <- ggplot(cluster_large, aes(x = cluster, y = Vgene)) + geom_tile(fill = "black") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
                                   axis.ticks.x = element_blank()) + xlab("") +ylab("V usage")
# Jfamily usage
g3 <- ggplot(cluster_large, aes(x = cluster, y = Jfamily)) + geom_tile(fill = "black") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
                                   axis.ticks.x = element_blank()) + xlab("") +ylab("J usage")

# number of donors represented (ie with sequences present) in each cluster
g4 <- ggplot(cluster_large[which(!is.na(cluster_large$AgeGroup)), ], 
             aes(x = cluster, y = n_patient, fill = AgeGroup)) + geom_bar(stat = "identity") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) +
  scale_fill_manual(values = c("leq50" = "grey30", "geq60" = "grey70"),
                    label = c(expression("Age ">=60), expression("Age "<=50)), name = "") +
  xlab("Convergent clusters") +ylab("Number of\ndonors")

cowplot::plot_grid(g1, g2, g3, g4, ncol = 1, axis = "lr", align = "v", rel_heights = c(2, 3, 1, 1))

Clusters of sequences with both repertoire sequences and known binders - what are the targets?

both_targets <- unique(membership[membership$cluster %in% membership_both,
                                  c("Seq_ID", "cluster", "cluster_size", "SampleType2", "Specificity")])
both_targets$Specificity <- replace(both_targets$Specificity,
                                    which(grepl("7cws|7l2e|7m8j|7lcn", both_targets$Seq_ID)),
                                    "Spike/NTD")
both_targets$Specificity <- replace(both_targets$Specificity,
                                    which(both_targets$Specificity == "PDB"),
                                    "Spike/RBD")
both_targets <- ddply(both_targets, c("cluster", "cluster_size", "SampleType2", "Specificity"), nrow)
both_targets <- both_targets[order(both_targets$cluster_size, decreasing = TRUE), ]
both_targets$cluster <- factor(both_targets$cluster, levels = unique(both_targets$cluster))

# n sequences per cluster
g4 <- ggplot(unique(both_targets[, c("cluster", "Specificity", "V1")]), 
             aes(x = cluster, y = V1, fill = Specificity)) + geom_bar(stat = "identity") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) + 
  scale_fill_manual(values = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4",
                               "COVID-19Recovered" = "#FDC086", "Spike/NTD" = "#bd40ac",
                               "Spike" = "#99CCFF", "Spike/RBD" = "#0000FF"),name = "",
                    labels = c("CV19", "CV19-Recovered", "Healthy", "Spike", "Spike/NTD", "Spike/RBD")) +
  xlab("") +ylab("Number of\nsequences") + ggtitle("Clusters with binders (n = 28)")
# targets
g5 <- ggplot(both_targets[which(grepl("Spike", both_targets$Specificity)), ], 
             aes(x = cluster, y = Specificity)) + geom_tile(fill = "black") +
  cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
                                   axis.ticks.x = element_blank()) + xlab("Convergent clusters") +ylab("")

cowplot::plot_grid(g4, g5, ncol = 1, axis = "lr", align = "v", rel_heights = c(2,1))

Example of convergent clusters

library(visNetwork)
visualiseNetwork <- function(graph, nodes, node_annotation, cluster, node_id_col = "Seq_ID", 
                             node_color_col = "Specificity", 
                             node_colors = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4",
                                             "COVID-19Recovered" = "#FDC086", "PDB" = "#000000",
                                             "Spike" = "#99CCFF", "Spike/RBD" = "#0000FF"))
{
  graphData <- visNetwork::toVisNetworkData(igraph::induced_subgraph(graph, nodes))
  graphData$edges$width <- (15 - 1) * ( graphData$edges$identity - 0.85 ) / (1 - 0.85) + 1
  graphData$edges$color <- "grey"
  graphData$nodes <- merge(graphData$nodes, node_annotation, by.x = "id", by.y = node_id_col,
                           all.x = TRUE, all.y = FALSE, sort = FALSE)
  graphData$nodes$label <- NA # no labels
  graphData$nodes$color <- graphData$nodes$Specificity
  for( i in names(node_colors) ){
    graphData$nodes$color <- replace(graphData$nodes$color, 
                                     which(graphData$nodes[, node_color_col] == i),
                                     node_colors[i])
  }
  lnodes <- data.frame(node_colors)
  colnames(lnodes)[1] <- "color"
  lnodes$label <- rownames(lnodes)
  lnodes$color.border <- "white"
  lnodes$title <- ""
  lnodes$id <- 1:nrow(lnodes)
  visNetwork(nodes = graphData$nodes, edges = graphData$edges, main = paste0("Cluster ", cluster)) %>%
    visLegend(addNodes = lnodes, useGroups = FALSE) %>% visIgraphLayout()
}
visualiseNetwork(converge_graph, membership[membership$cluster == 591, 1], cluster = 591,
                 node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 676, 1], cluster = 676,
                 node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1237, 1], cluster = 1237,
                 node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1477, 1], cluster = 1477,
                 node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1887, 1], cluster = 1887,
                 node_annotation = converge_nodes)

CDRH3 AA Sequence logos of these clusters:

seqs <- membership[membership$cluster %in% c(591, 676, 1237, 1477, 1887), c("Seq_ID", "cluster", "CDRH3")]
seqs <- merge(seqs, ddply(seqs, "cluster", nrow), by = "cluster")
seqs$cluster <- paste0("Cluster ", seqs$cluster, "\n(n = ", seqs$V1, ")")

library(ggseqlogo)
cowplot::plot_grid(
  # cluster 591
  ggseqlogo(seqs[which(grepl("Cluster 591", seqs$cluster)), "CDRH3"]) +
    ylab(unique(seqs[which(grepl("Cluster 591", seqs$cluster)), "cluster"])) +
    theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
                            axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
  # cluster 676
  ggseqlogo(seqs[which(grepl("Cluster 676", seqs$cluster)), "CDRH3"]) +
    ylab(unique(seqs[which(grepl("Cluster 676", seqs$cluster)), "cluster"])) +
    theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
                            axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
  # cluster 1239
  ggseqlogo(seqs[which(grepl("Cluster 1237", seqs$cluster)), "CDRH3"]) +
    ylab(unique(seqs[which(grepl("Cluster 1237", seqs$cluster)), "cluster"])) +
    theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(), 
                            axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
  # cluster 1479
  ggseqlogo(seqs[which(grepl("Cluster 1477", seqs$cluster)), "CDRH3"]) +
    ylab(unique(seqs[which(grepl("Cluster 1477", seqs$cluster)), "cluster"])) +
    theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(), 
                            axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
  # cluster 1888
  ggseqlogo(seqs[which(grepl("Cluster 1887", seqs$cluster)), "CDRH3"]) +
    ylab(unique(seqs[which(grepl("Cluster 1887", seqs$cluster)), "cluster"])) +
    theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(), 
                            axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
  align = "v", axis = "lr", ncol = 1
)

Analysis

Expansion

membership$large <- (membership$cluster_size >= 10)
membership$SampleType2 <- replace(membership$Specificity,
                                  which(membership$Specificity == "COVID-19Recovered"),
                                  "COVID-19")
test_data$SampleType3 <- replace(test_data$SampleType2,
                                  which(test_data$SampleType2 == "COVID-19Recovered"),
                                  "COVID-19")
expansion <- ddply(membership, c("SampleType2", "large", "CloneGroup"), nrow)
expansion <- expansion[which(expansion$SampleType2 %in% c("COVID-19", "Healthy")), ]
expansion <- merge(
  expansion, ddply(membership, c("SampleType2", "large"), nrow), all.x = TRUE, all.y = FALSE,
  by = c("SampleType2", "large"), sort = FALSE
)
expansion$perc <- expansion[, 4] / expansion[, 5]
colnames(expansion) <- c("SampleType2", "large", "CloneGroup", "n", "total", "perc")
expansion$type <- "convergent"
expansion <- list(
  expansion, 
  merge(
    ddply(test_data[!test_data$Seq_ID %in% membership$Seq_ID, ], 
          c("SampleType3", "CloneGroup"), nrow), 
    ddply(test_data[!test_data$Seq_ID %in% membership$Seq_ID, ], 
          "SampleType2", nrow), all.x = TRUE, all.y = FALSE,
    by.x = "SampleType3", by.y = "SampleType2", sort = FALSE
  )
)
expansion[[2]]$perc <- expansion[[2]][, 3] / expansion[[2]][, 4]
colnames(expansion[[2]]) <- c("SampleType2", "CloneGroup", "n", "total", "perc")
expansion[[2]] <- expansion[[2]][which(expansion[[2]]$SampleType2 %in% c("COVID-19", "Healthy")), ]
expansion[[2]]$type <- "other sequences"
expansion[[2]]$large <- "others"
expansion[[2]] <- expansion[[2]][, c("SampleType2", "large", "CloneGroup", "n", "total", "perc", "type")]
expansion <- do.call("rbind", expansion)
expansion$CloneGroup <- factor(expansion$CloneGroup,
                               levels = c("Unique", "2", "3", "4&5", "6to9", ">10"),
                               labels = c("Unique", "2", "3", "4-5", "6-9", 
                                          "10 or more"))
expansion$large <- factor(expansion$large, levels = c(TRUE, FALSE, "others"),
                          labels = c("convergent\n(10 or more\nsequences)", "convergent\n(<10 sequences)",
                                     "other\nsequences"))
ggplot(expansion, aes(y = CloneGroup, x = perc)) + 
  geom_bar(stat = "identity", position = position_dodge(), width = 0.5) +
  scale_x_continuous(labels = scales::percent, name = "% sequences") +
  ylab("Number of sequences in clone") + cowplot::theme_cowplot() + 
  facet_grid(large ~ SampleType2) + theme(strip.text = element_text(size = 9))

Another way of presenting this, treating NumInClone as a continuous variable:

expansion_cont <- membership[membership$SampleType2 %in% c("COVID-19", "Healthy"), c("Seq_ID", "large", "NumInClone", "SampleType2")]
expansion_cont <- list(
  expansion_cont,
  test_data[which(!test_data$Seq_ID %in% expansion_cont$Seq_ID & 
              test_data$SampleType2 %in% c("COVID-19", "Healthy")), 
            c("Seq_ID", "SampleType", "NumInClone", "SampleType2")]
)
expansion_cont[[2]][, 2] <- "others"
colnames(expansion_cont[[2]]) <- c("Seq_ID", "large", "NumInClone", "SampleType2")
expansion_cont <- do.call("rbind", expansion_cont)
expansion_cont$large <- factor(expansion_cont$large, levels = c(TRUE, FALSE, "others"),
                          labels = c("convergent\n(10 or moresequences)", "convergent\n(<10 sequences)",
                                     "other sequences"))

ggplot(expansion_cont, aes(x = large, y = NumInClone)) + geom_boxplot(outlier.shape = NA) + 
  facet_grid(~ SampleType2, drop = TRUE) + 
  scale_y_log10(limits = c(1, 500), name = "Number of sequences\nin clone") + cowplot::theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("")

# cant make the ggpubr::compare_means add brackets & p-value * properly so here are the data
ggpubr::compare_means(NumInClone ~ large, group.by = "SampleType2", 
                      data = expansion_cont, method = "wilcox", ref.group = "other sequences")
SampleType2 .y. group1 group2 p p.adj p.format p.signif method
Healthy NumInClone other sequences convergent
(10 or moresequences) 0 0 <2e-16 **** Wilcoxon
Healthy NumInClone other sequences convergent
(<10 sequences) 0 0 <2e-16 **** Wilcoxon
COVID-19 NumInClone other sequences convergent
(10 or moresequences) 0 0 <2e-16 **** Wilcoxon
COVID-19 NumInClone other sequences convergent
(<10 sequences) 0 0 <2e-16 **** Wilcoxon

Gene usage

V/J gene usage of large convergent clusters shown above.

Here show relationship between convergent cluster size and the number of patients sharing CDRH3 sequences in the convergent cluster. As expected the larger a cluster the more likely we see more patients sharing sequences in this cluster. Map V/J gene usage onto this plot:

gene_usage <- ddply(membership, c("Jfamily", "Vgene", "cluster", "cluster_size"),
                    summarise,
                    n_patient = length(unique(PatientID[SampleType2 == "COVID-19"])),
                    median_clonesize = median(NumInClone, na.rm = TRUE),
                    lowerci_clonesize = quantile(NumInClone, probs = 0.025, 
                                                 na.rm = TRUE),
                    upperci_clonesize = quantile(NumInClone, probs = 0.975, 
                                                 na.rm = TRUE))

clustersize <- ddply(gene_usage, c("Vgene", "Jfamily"), summarise, 
                     median_clustersize = mean(cluster_size))
clustersize$VJ <- interaction(clustersize$Vgene, clustersize$Jfamily)
clustersize$VJ <- as.character(clustersize$VJ)
gene_usage$VJ <- interaction(gene_usage$Vgene, gene_usage$Jfamily)
gene_usage$VJ <- factor(gene_usage$VJ,
  levels = as.character(clustersize[order(clustersize$median_clustersize, 
                                          decreasing = TRUE), "VJ"])
)

gene_usage$Vgene_colour <- replace(gene_usage$Vgene,
                                   which(!gene_usage$Vgene %in% c("IGHV1-69",
                                       "IGHV3-7", "IGHV3-23", "IGHV3-30", "IGHV3-30-3",
                                       "IGHV3-33", "IGHV3-53","IGHV4-34","IGHV4-39")),
                                   "others")
vgene_colours <- c(scales::hue_pal()(10)[c(1,8,2,9,7,6,4,3,10)])
names(vgene_colours) <-  c("IGHV1-69", "IGHV3-7", "IGHV3-23", "IGHV3-30", "IGHV3-30-3",
                           "IGHV3-33", "IGHV3-53","IGHV4-34","IGHV4-39")
vgene_colours <- c(vgene_colours, "others" = "grey")
ggplot(gene_usage, aes(x = cluster_size, y = n_patient, colour = Vgene_colour)) + 
  geom_point(size = 2) + facet_wrap(~ Jfamily) + cowplot::theme_cowplot() +
  scale_colour_manual(values = vgene_colours, name = "") +
  xlab("Number of sequences in convergent cluster") + 
  ylab("Number of patients sharing clones")

References

Brouwer, Philip J. M., Tom G. Caniels, Karlijn van der Straten, Jonne L. Snitselaar, Yoann Aldon, Sandhya Bangaru, Jonathan L. Torres, et al. 2020. “Potent Neutralizing Antibodies from COVID-19 Patients Define Multiple Targets of Vulnerability.” Science 369 (6504): 643–50. https://doi.org/10.1126/science.abc5902.
Dugan, H. L., C. T. Stamper, L. Li, S. Changrob, N. W. Asby, P. J. Halfmann, N. Y. Zheng, et al. 2021. Profiling B cell immunodominance after SARS-CoV-2 infection reveals antibody evolution to non-neutralizing viral targets.” Immunity, May.
Gaebler, C., Z. Wang, J. C. C. Lorenzi, F. Muecksch, S. Finkin, M. Tokuyama, A. Cho, et al. 2021. Evolution of antibody immunity to SARS-CoV-2.” Nature 591 (7851): 639–44.
Kreer, C., M. Zehner, T. Weber, M. S. Ercanoglu, L. Gieselmann, C. Rohde, S. Halwe, et al. 2020. Longitudinal Isolation of Potent Near-Germline SARS-CoV-2-Neutralizing Antibodies from COVID-19 Patients.” Cell 182 (4): 843–54.
Robbiani, D. F., C. Gaebler, F. Muecksch, J. C. C. Lorenzi, Z. Wang, A. Cho, M. Agudelo, et al. 2020. Convergent antibody responses to SARS-CoV-2 in convalescent individuals.” Nature 584 (7821): 437–42.
Zost, S. J., P. Gilchuk, R. E. Chen, J. B. Case, J. X. Reidy, A. Trivette, R. S. Nargi, et al. 2020. Rapid isolation and profiling of a diverse panel of human monoclonal antibodies targeting the SARS-CoV-2 spike protein.” Nat Med 26 (9): 1422–27.